package com.kdtech.analyse.news;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.HtmlCleaner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.StringUtils;
import com.kdtech.utils.HtmlCleaner;

/**
 * News page parser for the Suzhou News Network (subaonet.com and related hosts).
 *
 * <p>Several page templates are supported. For each field (title, body, date) an
 * ordered list of CSS selectors is tried and the first one that yields non-blank
 * text is used.
 *
 * @author Persh
 */
public class SubaonetNewsAnalyse implements AnalyseNews {

	/** URL regular expressions that identify an article (detail) page. */
	private static final String[] regex={
		"http://.*.subaonet.com/[0-9]{4}/[0-9]{4}/[0-9]*.shtml",
		"http://.*.subaonet.com/.*/[0-9]{4}-[0-9]{2}-[0-9]{2}/[0-9]*.html",
		"http://.*.subaonet.com/[0-9]{4}/[0-9]*/[0-9]*.html",
		"http://hyy.subaonet.com/.*/[0-9]*.html",
		"http://.*.subaonet.com/.*/[0-9]{4}/[0-9]{4}/[0-9]*.shtml",
		"http://.*.szqcw.com/html/.*/[0-9]{8}/[0-9]*_[0-9]*.html",
		"http://.*.subaonet.com/[0-9]{4}/[0-9]*/[0-9]*.shtml"};

	/** {@link #regex} compiled once, so URLs are not matched against freshly compiled patterns on every call. */
	private static final java.util.regex.Pattern[] DETAIL_PAGE_PATTERNS = compileAll(regex);

	/** Title selectors, tried in order. */
	private static final String[] TITLE_SELECTORS = {
		"div.content-wrap>h1",
		"li#biao_z",
		"div.article_box>h2",
		"div.nr_nr>h1",
		"h3.zxfztitle",
		"div.gall-title>h1",
		"h1.article-title",
		"h1"};

	/** Article body selectors, tried in order. */
	private static final String[] CONTENT_SELECTORS = {
		"div#ctrlfscont.cont-detail",
		"li#fontzoom",
		"div.articleInfo",
		"div.nr_nr",
		"div#zoom.zxfzcontent",
		"div.gallery-inner-wrap",
		"div.article-content"};

	/** Selectors for the text that is handed to the date matcher, tried in order. */
	private static final String[] DATE_SELECTORS = {
		"div.arti-atttibute>span",
		"li#xiao_z",
		"div.a_info>span",
		"div.nr_zz>span",
		"div.zxinfo",
		"div.gall-title>span",
		"div.article-infos"};

	/** Selector of the element holding the comment-count text. */
	private static final String COMMENT_COUNT_SELECTOR = "span.comment-count";

	/**
	 * Tells whether the URL matches one of the known article URL patterns.
	 *
	 * @param url the URL to test; {@code null} is treated as "not a detail page"
	 * @return {@code true} if the whole URL matches at least one pattern
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (java.util.regex.Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Extracts title, body, date, comment count and author from a fetched page.
	 *
	 * <p>This method does not check {@link #isDetailPage(String)}; any page with a URL
	 * and HTML is parsed.
	 *
	 * @param urlMeta the fetched page (URL plus HTML)
	 * @return the populated news record, or {@code null} when {@code urlMeta}, its URL
	 *         or its HTML is {@code null}
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		if (urlMeta == null) {
			return null;
		}
		String htmltxt = urlMeta.getHtml();
		String url = urlMeta.getUrl();
		if (htmltxt == null || url == null) {
			// Nothing to parse; previously this path ended in an exception.
			return null;
		}

		NewsMeta news = new NewsMeta();
		news.setUrl(url);

		Document doc = Jsoup.parse(htmltxt);

		String title = firstNonBlankText(doc, TITLE_SELECTORS);
		String content = firstNonBlankContent(url, doc);

		// Prefer a date found in the page; fall back to one embedded in the URL.
		Long date = DateUtils.matchDate(firstNonBlankText(doc, DATE_SELECTORS));
		if (date == null) {
			date = DateUtils.matchDate(url);
		}

		Integer commentNum = parseCommentCount(doc);
		if (commentNum != null) {
			news.setCommentNum(commentNum);
		}

		news.setTitle(StringUtils.trimSpace(title));
		news.setContent(content);
		news.setDate(date);
		news.setUpdateUrl(url);

		String author = doc.select("span.source").text();
		if (author != null && author.indexOf("来源：") != -1) {
			// Keep only what follows the label prefix.
			author = StringUtils.substringAfter(author, "来源：");
		}
		news.setAuthor(author);
		return news;
	}

	/** Manual smoke test: fetches one article and prints the parsed result. */
	public static void main(String[] args) {
		SubaonetNewsAnalyse analyse = new SubaonetNewsAnalyse();
		String url = "http://news.subaonet.com/2014/0401/1310468.shtml";
		UrlMeta meta = CrawlHTML.responseToURL(url);
		System.out.println(analyse.parserHtml(meta));
	}

	/**
	 * Re-fetches the record's update URL and refreshes its comment count.
	 *
	 * @param meta the record to refresh
	 * @return {@code meta} (with the comment count updated when one could be parsed),
	 *         or {@code null} when {@code meta}, its update URL, the fetch result or
	 *         the fetched HTML is {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		if (meta == null) {
			return null;
		}
		String updateUrl = meta.getUpdateUrl();
		if (updateUrl == null) {
			return null;
		}
		UrlMeta response = CrawlHTML.responseToURL(updateUrl);
		if (response == null) {
			return null;
		}
		String htmltxt = response.getHtml();
		if (htmltxt == null) {
			// Treated like a failed fetch instead of letting the HTML parser throw.
			return null;
		}
		Integer commentNum = parseCommentCount(Jsoup.parse(htmltxt));
		if (commentNum != null) {
			meta.setCommentNum(commentNum);
		}
		return meta;
	}

	/**
	 * @return always {@code true}
	 */
	public boolean isNeedUpdate(){
		return true;
	}

	/** Compiles every expression in the array, preserving order. */
	private static java.util.regex.Pattern[] compileAll(String[] expressions) {
		java.util.regex.Pattern[] compiled = new java.util.regex.Pattern[expressions.length];
		for (int i = 0; i < expressions.length; i++) {
			compiled[i] = java.util.regex.Pattern.compile(expressions[i]);
		}
		return compiled;
	}

	/** Returns {@code true} for {@code null} or a string that is empty after {@code trim()}. */
	private static boolean isBlank(String value) {
		return value == null || value.trim().length() == 0;
	}

	/**
	 * Returns the text of the first selector that yields non-blank text; when none
	 * does, the (blank) text of the last selector tried is returned.
	 */
	private static String firstNonBlankText(Document doc, String[] selectors) {
		String text = null;
		for (String selector : selectors) {
			text = doc.select(selector).text();
			if (!isBlank(text)) {
				break;
			}
		}
		return text;
	}

	/**
	 * Returns the cleaned HTML of the first body selector that yields non-blank
	 * content; when none does, the result for the last selector tried is returned.
	 */
	private static String firstNonBlankContent(String url, Document doc) {
		String content = null;
		for (String selector : CONTENT_SELECTORS) {
			content = HtmlCleaner.getContentHtml(url, doc.select(selector));
			if (!isBlank(content)) {
				break;
			}
		}
		return content;
	}

	/**
	 * Best-effort extraction of the comment count: takes the text between the first
	 * "有" (or the start of the text when it is absent) and the first "人" and parses
	 * it as an integer.
	 *
	 * @return the parsed count, or {@code null} when the element is blank, the
	 *         markers are missing/out of order, or the text between them is not an integer
	 */
	private static Integer parseCommentCount(Document doc) {
		String raw = doc.select(COMMENT_COUNT_SELECTOR).text();
		if (isBlank(raw)) {
			return null;
		}
		int start = raw.indexOf("有") + 1;
		int end = raw.indexOf("人");
		if (end < start) {
			// Closing marker missing or located before the opening one.
			return null;
		}
		try {
			return Integer.valueOf(Integer.parseInt(raw.substring(start, end)));
		} catch (NumberFormatException ignored) {
			// Page layout differs from what is expected; the count is optional.
			return null;
		}
	}
}
